{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import codecs  # for unicode encoding\n",
    "import glob  # for reading files\n",
    "import multiprocessing   # to run your model faster\n",
    "import os  # access the filesystem\n",
    "import re  # to run regular expression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import nltk\n",
    "import gensim.models.word2vec as w2v\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# open your files\n",
    "\n",
    "book_filenames = sorted(glob.glob(\"data/*.txt\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['data/got1.txt',\n",
       " 'data/got2.txt',\n",
       " 'data/got3.txt',\n",
       " 'data/got4.txt',\n",
       " 'data/got5.txt']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "book_filenames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus length 1770659\n",
      "Corpus length 4071041\n",
      "Corpus length 6391405\n",
      "Corpus length 8107945\n",
      "Corpus length 9719485\n"
     ]
    }
   ],
   "source": [
    "# STEP 1\n",
    "\n",
    "# combine all books into one large corpus\n",
    "\n",
    "corpus_raw = u\"\"\n",
    "\n",
    "# read every file and append it to corpus raw\n",
    "\n",
    "for book in book_filenames:\n",
    "    with codecs.open(book,\"r\",\"utf-8\") as book_file:\n",
    "        corpus_raw += book_file.read()\n",
    "    print(\"Corpus length {0}\".format(len(corpus_raw)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /Users/abhishek/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to\n",
      "[nltk_data]     /Users/abhishek/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# download certain helper functions from nltk\n",
    "\n",
    "nltk.download(\"punkt\")\n",
    "nltk.download(\"stopwords\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a tokenizer\n",
    "\n",
    "tokenizer = nltk.data.load(\"tokenizers/punkt/english.pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# STEP 2\n",
    "\n",
    "# split my corpus into sentences using this tokenizer\n",
    "\n",
    "raw_sentences = tokenizer.tokenize(corpus_raw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "128868"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(raw_sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'It was here the ravens came, after long flight.'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_sentences[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# STEP 3 - cleaning your data\n",
    "\n",
    "# remove non letters and split into words\n",
    "\n",
    "def sentence_to_wordlist(raw):\n",
    "    clean = re.sub(\"[^a-zA-Z]\",\" \", raw)\n",
    "    words = clean.split()\n",
    "    return words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# split raw sentences\n",
    "\n",
    "sentences = []\n",
    "\n",
    "for raw in raw_sentences:\n",
    "    if len(raw)>0:\n",
    "        sentences.append(sentence_to_wordlist(raw))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'It was here the ravens came, after long flight.'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_sentences[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['It', 'was', 'here', 'the', 'ravens', 'came', 'after', 'long', 'flight']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentences[10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1818103"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "token_count = sum([len(sentence) for sentence in sentences])\n",
    "token_count"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# train word 2 vec\n",
    "\n",
    "word2vec = w2v.Word2Vec(\n",
    "    sg=1,\n",
    "    seed=1,\n",
    "    workers= multiprocessing.cpu_count(),\n",
    "    size=300,\n",
    "    min_count=3,\n",
    "    window=7,\n",
    "    sample=1e-3\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "word2vec.build_vocab(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17277"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(word2vec.wv.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7021780"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.train(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "NameError",
     "evalue": "name 'word2vec' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-2-24f281e03886>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mword2vec\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmost_similar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Stark\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m: name 'word2vec' is not defined"
     ]
    }
   ],
   "source": [
    "word2vec.wv.most_similar(\"Stark\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('Lannister', 1.1300432682037354),\n",
       " ('Tyrion', 1.118436336517334),\n",
       " ('Kevan', 1.1168137788772583),\n",
       " ('Lancel', 1.0900390148162842),\n",
       " ('Tywin', 1.0680841207504272),\n",
       " ('Sansa', 1.0382449626922607),\n",
       " ('Margaery', 1.0335110425949097),\n",
       " ('Joff', 1.0157004594802856),\n",
       " ('Joffrey', 1.015446424484253),\n",
       " ('dwarf', 1.0100629329681396)]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "word2vec.wv.most_similar_cosmul(positive=[\"\",\"woman\"], negative=[\"man\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
